-1

I want to read the contents between <title> and </title> in an HTML string.

I think the pattern, written in Objective-C, should be:

@"<title([\\s\\S]*)</title>"

Below is the code I wrote as a wrapper for regular expressions:

//source of NSStringCategory.h
#import <Foundation/Foundation.h>
#import <regex.h>


// Thin object wrapper around a compiled POSIX regular expression (regex.h).
// Despite its name this is a regular class, not a category; the category on
// NSString that uses it is declared separately.
@interface NSStringCategory:NSObject
{
    // Compiled regex state; filled by regcomp() in the initializer and
    // released by regfree() in -dealloc.
    regex_t preg;
}

// Compiles `pattern` with regcomp(). `options` are regcomp() flags;
// REG_EXTENDED is always OR'ed in. Raises an exception named
// "CSRegexException" if the pattern does not compile.
-(id)initWithPattern:(NSString *)pattern options:(int)options;
// Frees the compiled regex.
-(void)dealloc;

// Returns YES when regexec() finds a match anywhere in `string`.
-(BOOL)matchesString:(NSString *)string;
// Returns the text of the first overall match in `string`, or nil when
// nothing matches.
-(NSString *)matchedSubstringOfString:(NSString *)string;
// Returns an array with re_nsub+1 entries for the first match: index 0 is the
// whole match, followed by one entry per parenthesized subexpression. A group
// that did not take part in the match is represented by the +null object.
// Returns nil when nothing matches.
-(NSArray *)capturedSubstringsOfString:(NSString *)string;

// Autoreleased convenience constructors; the one-argument form passes 0 for
// `options`.
+(NSStringCategory *)regexWithPattern:(NSString *)pattern options:(int)options;
+(NSStringCategory *)regexWithPattern:(NSString *)pattern;

// The shared placeholder string stored for unmatched capture groups.
+(NSString *)null;

// Creates the shared placeholder string once.
+(void)initialize;

@end


// Convenience regex methods on NSString. Each call builds a temporary
// NSStringCategory regex from `pattern` and applies it to the receiver.
// `options` are regcomp() flags; the variants without `options` pass 0.
@interface NSString (NSStringCategory)


// YES when the receiver contains a match for `pattern`. REG_NOSUB is added to
// `options` because only the yes/no answer is needed.
-(BOOL)matchedByPattern:(NSString *)pattern options:(int)options;

-(BOOL)matchedByPattern:(NSString *)pattern;

// The text of the first match of `pattern` in the receiver, or nil if there is
// no match.
-(NSString *)substringMatchedByPattern:(NSString *)pattern options:(int)options;


-(NSString *)substringMatchedByPattern:(NSString *)pattern;


// The whole match followed by every captured subexpression of the first match,
// or nil if there is no match.
-(NSArray *)substringsCapturedByPattern:(NSString *)pattern options:(int)options;


-(NSArray *)substringsCapturedByPattern:(NSString *)pattern;


// A copy of the receiver with each of the characters ^ . [ $ ( ) | * + ? { \
// preceded by a backslash.
-(NSString *)escapedPattern;

@end

and .m file

 #import "NSStringCategory.h"
static NSString *nullstring=nil;

@implementation NSStringCategory

// Compiles `pattern` as a POSIX extended regular expression.
// `options` are regcomp() flags; REG_EXTENDED is always added.
// Raises "CSRegexException" when the pattern is nil or does not compile.
// NOTE(review): the exception is raised without releasing self, so a failed
// init leaks the half-built object; releasing here would run -dealloc and
// regfree() a regex_t that was never successfully compiled.
-(id)initWithPattern:(NSString *)pattern options:(int)options
{
    if(self=[super init])
    {
        // -UTF8String on nil yields NULL, which regcomp() must never receive.
        const char *cpattern=[pattern UTF8String];
        if(!cpattern)
        {
            [NSException raise:@"CSRegexException"
                        format:@"Could not compile regex: pattern is nil"];
        }

        int err=regcomp(&preg,cpattern,options|REG_EXTENDED);
        if(err)
        {
            char errbuf[256];
            regerror(err,&preg,errbuf,sizeof(errbuf));
            [NSException raise:@"CSRegexException"
                        format:@"Could not compile regex \"%@\": %s",pattern,errbuf];
        }
    }
    return self;
}

// Releases the compiled regex.
-(void)dealloc
{
    regfree(&preg);
    [super dealloc];
}

// Returns YES when the regex matches anywhere in `string`.
// A nil string never matches.
-(BOOL)matchesString:(NSString *)string
{
    const char *cstr=[string UTF8String];
    if(!cstr) return NO;

    return regexec(&preg,cstr,0,NULL,0)==0;
}

// Returns the text of the first overall match in `string`, or nil when there
// is no match, `string` is nil, or no match offsets are available.
-(NSString *)matchedSubstringOfString:(NSString *)string
{
    const char *cstr=[string UTF8String];
    if(!cstr) return nil;

    // Pre-fill the offsets: a regex compiled with REG_NOSUB makes regexec()
    // ignore the match array, and the offsets would otherwise be garbage.
    regmatch_t match;
    match.rm_so=-1;
    match.rm_eo=-1;

    if(regexec(&preg,cstr,1,&match,0)!=0) return nil;
    if(match.rm_so<0||match.rm_eo<match.rm_so) return nil;

    return [[[NSString alloc] initWithBytes:cstr+match.rm_so
                                     length:match.rm_eo-match.rm_so encoding:NSUTF8StringEncoding] autorelease];
}

// Returns the whole match (index 0) followed by one entry per parenthesized
// subexpression for the first match in `string`. Groups that did not take
// part in the match are represented by the +null placeholder. Returns nil
// when there is no match, `string` is nil, or memory cannot be allocated.
-(NSArray *)capturedSubstringsOfString:(NSString *)string
{
    const char *cstr=[string UTF8String];
    if(!cstr) return nil;

    size_t num=preg.re_nsub+1;
    regmatch_t *matches=calloc(num,sizeof(regmatch_t));
    if(!matches) return nil;

    NSArray *result=nil;

    if(regexec(&preg,cstr,num,matches,0)==0)
    {
        NSMutableArray *array=[NSMutableArray arrayWithCapacity:num];

        for(size_t i=0;i<num;i++)
        {
            NSString *str;

            if(matches[i].rm_so==-1&&matches[i].rm_eo==-1) str=nullstring;
            else str=[[[NSString alloc] initWithBytes:cstr+matches[i].rm_so
                                               length:matches[i].rm_eo-matches[i].rm_so encoding:NSUTF8StringEncoding] autorelease];

            // The byte offsets may split a multi-byte UTF-8 sequence, in which
            // case NSString creation yields nil; adding nil to the array would
            // throw, so fall back to the placeholder to keep indices aligned.
            if(!str) str=nullstring;

            [array addObject:str];
        }

        result=[NSArray arrayWithArray:array];
    }

    // Single exit point so the offsets buffer is freed on every path.
    free(matches);

    return result;
}

// Autoreleased convenience constructor.
+(NSStringCategory *)regexWithPattern:(NSString *)pattern options:(int)options
{ return [[[NSStringCategory alloc] initWithPattern:pattern options:options] autorelease]; }

// Autoreleased convenience constructor using no extra regcomp() flags.
+(NSStringCategory *)regexWithPattern:(NSString *)pattern
{ return [[[NSStringCategory alloc] initWithPattern:pattern options:0] autorelease]; }

// The shared placeholder stored for capture groups that did not match.
+(NSString *)null { return nullstring; }

// Creates the shared placeholder once; the guard keeps a repeated +initialize
// (for example on behalf of a subclass) from replacing it.
+(void)initialize
{
    if(!nullstring) nullstring=[[NSString alloc] initWithString:@""];
}

@end

@implementation NSString (NSStringCategory)

// YES when the receiver contains a match for `pattern`; only the yes/no
// answer is needed, so REG_NOSUB is added to the flags.
-(BOOL)matchedByPattern:(NSString *)pattern options:(int)options
{
    int flags=options|REG_NOSUB;
    return [[NSStringCategory regexWithPattern:pattern options:flags] matchesString:self];
}

// Same as above with no extra flags.
-(BOOL)matchedByPattern:(NSString *)pattern
{
    return [self matchedByPattern:pattern options:0];
}

// The text of the first match of `pattern` in the receiver, or nil.
-(NSString *)substringMatchedByPattern:(NSString *)pattern options:(int)options
{
    return [[NSStringCategory regexWithPattern:pattern options:options] matchedSubstringOfString:self];
}

// Same as above with no extra flags.
-(NSString *)substringMatchedByPattern:(NSString *)pattern
{
    return [self substringMatchedByPattern:pattern options:0];
}

// The whole match plus every captured subexpression of the first match, or nil.
-(NSArray *)substringsCapturedByPattern:(NSString *)pattern options:(int)options
{
    return [[NSStringCategory regexWithPattern:pattern options:options] capturedSubstringsOfString:self];
}

// Same as above with no extra flags.
-(NSArray *)substringsCapturedByPattern:(NSString *)pattern
{
    return [self substringsCapturedByPattern:pattern options:0];
}

// Returns a copy of the receiver in which every regex metacharacter is
// preceded by a backslash, so the result matches the receiver literally.
-(NSString *)escapedPattern
{
    int count=[self length];
    NSMutableString *result=[NSMutableString stringWithCapacity:count];

    int pos=0;
    while(pos<count)
    {
        unichar ch=[self characterAtIndex:pos];
        switch(ch)
        {
            // Characters with special meaning in the pattern syntax.
            case '^': case '.': case '[': case '$':
            case '(': case ')': case '|': case '*':
            case '+': case '?': case '{': case '\\':
                [result appendFormat:@"\\%C",ch];
                break;

            default:
                [result appendFormat:@"%C",ch];
                break;
        }
        pos++;
    }

    return [NSString stringWithString:result];
}

@end

I use the code below to get the string between "<title>" and "</title>":

// NOTE(review): regex.h implements POSIX regular expressions, which do not
// support PCRE shorthand classes such as \s and \S, so this pattern cannot
// match as intended. A POSIX-compatible alternative is
// @"<title[^>]*>([^<]*)</title>" with options:REG_ICASE.
NSStringCategory *a=[[NSStringCategory alloc] initWithPattern:@"<title([\s\S]*)</title>" options:0];//

Unfortunately, [a matchedSubstringOfString:response] always returns nil.

I do not know whether the regular expression is wrong or whether there is some other reason.

Any comments are welcome.

Thanks

interdev

arachide
  • 8,006
  • 18
  • 71
  • 134
  • 2
    Can you make the assumption that the web page is valid HTML? – Gareth Mar 30 '10 at 14:40
  • 1
    By the way, whatever the answer to my question, the follow-up will be "in that case, you don't want to use a regular expression" – Gareth Mar 30 '10 at 14:44

3 Answers3

3

(Preliminary warning: you can't parse HTML correctly with Regex.)


You are using regex.h, which provides POSIX regular expression (ERE in your case). They do not support all of the PCRE syntax such as \s and \S (and [\s\S] is useless anyway — it matches anything).

Probably you should use

initWithPattern:@"<title[^>]*>([^<]*)</title>" options:REG_ICASE
Community
  • 1
  • 1
kennytm
  • 510,854
  • 105
  • 1,084
  • 1,005
1

<title[^>]*>\([^<]*\)</title> should do the trick.

Tomislav Nakic-Alfirevic
  • 10,017
  • 5
  • 38
  • 51
0

For this specific case, I might try instantiating a WebDocumentRepresentation object from the /System/Library/Frameworks/WebKit framework.

You could set the data source for the WebDocumentRepresentation object to the HTML page you were interested in, and then use the object's title method to return the title.

Here's the Mac OSX Reference Library document on the object.

Stephen Harmon
  • 835
  • 2
  • 9
  • 15